plotly.offline.init_notebook_mode()
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
df = pd.read_csv("/Users/tyler/Portfolio/creditcard.csv")
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 284807 entries, 0 to 284806 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Time 284807 non-null float64 1 V1 284807 non-null float64 2 V2 284807 non-null float64 3 V3 284807 non-null float64 4 V4 284807 non-null float64 5 V5 284807 non-null float64 6 V6 284807 non-null float64 7 V7 284807 non-null float64 8 V8 284807 non-null float64 9 V9 284807 non-null float64 10 V10 284807 non-null float64 11 V11 284807 non-null float64 12 V12 284807 non-null float64 13 V13 284807 non-null float64 14 V14 284807 non-null float64 15 V15 284807 non-null float64 16 V16 284807 non-null float64 17 V17 284807 non-null float64 18 V18 284807 non-null float64 19 V19 284807 non-null float64 20 V20 284807 non-null float64 21 V21 284807 non-null float64 22 V22 284807 non-null float64 23 V23 284807 non-null float64 24 V24 284807 non-null float64 25 V25 284807 non-null float64 26 V26 284807 non-null float64 27 V27 284807 non-null float64 28 V28 284807 non-null float64 29 Amount 284807 non-null float64 30 Class 284807 non-null int64 dtypes: float64(30), int64(1) memory usage: 67.4 MB
df.columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
'Class'],
dtype='object')
# See how the data looks
df.sample(5)
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 181360 | 124943.0 | 0.009787 | 0.169569 | 0.059393 | -0.266553 | 1.046478 | 2.039161 | 0.135165 | -0.119449 | -0.950750 | ... | 0.106270 | 1.219889 | -0.246860 | -0.927909 | -0.830785 | 0.312845 | -0.551278 | -0.252516 | 7.81 | 0 |
| 139112 | 83019.0 | 1.044653 | -0.066710 | 0.508579 | 0.462502 | -0.412017 | -0.304318 | -0.044470 | 0.050467 | -0.221082 | ... | -0.131636 | -0.529180 | 0.129618 | 0.240069 | 0.041235 | 0.192747 | -0.036193 | 0.016757 | 64.01 | 0 |
| 162714 | 115313.0 | -1.152682 | 0.352632 | 2.698813 | 4.415090 | 0.185743 | 1.353093 | -0.510100 | 0.666712 | -1.517871 | ... | 0.150948 | 0.166889 | 0.081464 | -0.467486 | -0.018890 | 0.424690 | 0.321357 | 0.172998 | 98.31 | 0 |
| 34598 | 37744.0 | -1.204590 | 0.922388 | 1.160011 | 0.251146 | 1.026198 | -0.932686 | 0.696555 | 0.064930 | -0.425765 | ... | 0.041679 | -0.074861 | -0.333049 | -0.087232 | 0.575269 | -0.385513 | -0.117246 | 0.165928 | 1.00 | 0 |
| 112216 | 72564.0 | -2.467981 | 0.751884 | 0.458208 | -0.328720 | -0.026359 | -1.309760 | -0.052897 | 0.663812 | -0.843688 | ... | 0.219117 | 0.316258 | -0.484254 | 0.620798 | -0.027727 | 0.135159 | -0.267201 | -0.063993 | 12.59 | 0 |
5 rows × 31 columns
# Check NA
print(df.isnull().sum())
Time 0 V1 0 V2 0 V3 0 V4 0 V5 0 V6 0 V7 0 V8 0 V9 0 V10 0 V11 0 V12 0 V13 0 V14 0 V15 0 V16 0 V17 0 V18 0 V19 0 V20 0 V21 0 V22 0 V23 0 V24 0 V25 0 V26 0 V27 0 V28 0 Amount 0 Class 0 dtype: int64
df_Class_count = pd.DataFrame(df["Class"].value_counts()).reset_index()
df_Class_count.columns = ["Class", "Freq"]
fig = px.pie(df_Class_count, values="Freq", names="Class",
title="Not Fraud(0) vs Fraud(1)",
hole=0.4,
width=800,
height=600)
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()
print("Data is highly imbalanced with respect with target variable 'Class',\
\nsince 0.17% of transaction is fraud in total.")
Data is highly imbalanced with respect with target variable 'Class', since 0.17% of transaction is fraud in total.
X.columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
dtype='object')
X = df.iloc[:,0:30]
Y = df.iloc[:,-1]
Random_State = 2023
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=Random_State)
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size=0.25, random_state=Random_State)
# 60/20/20 split
rand_clf = RandomForestClassifier(random_state=Random_State)
rand_clf.fit(X_train, Y_train)
RandomForestClassifier(random_state=2023)
pred_clf = rand_clf.predict(X_valid)
tmp_clf = pd.DataFrame({'Feature': X.columns, 'Feature importance': rand_clf.feature_importances_})
tmp_clf = tmp_clf.sort_values(by='Feature importance',ascending=False)
fig = px.bar(tmp_clf, x="Feature", y="Feature importance", color = "Feature",
title="Feature Importance", color_discrete_sequence=px.colors.sequential.Agsunset)
fig.show()
roc_auc_score(Y_valid.values, pred_clf)
0.8921128951052808
Ada_clf = AdaBoostClassifier(n_estimators=100, random_state=Random_State)
Ada_clf.fit(X_train, Y_train)
AdaBoostClassifier(n_estimators=100, random_state=2023)
tmp_Ada = pd.DataFrame({'Feature': X.columns, 'Feature importance': Ada_clf.feature_importances_})
tmp_Ada = tmp_Ada.sort_values(by='Feature importance',ascending=False)
fig = px.bar(tmp_Ada, x="Feature", y="Feature importance", color = "Feature",
title="Feature Importance", color_discrete_sequence=px.colors.sequential.Agsunset)
fig.show()
pred_Ada = Ada_clf.predict(X_valid)
roc_auc_score(Y_valid.values, pred_Ada)
0.8919458180739759
Cbc_clf = CatBoostClassifier(iterations = 500,
od_wait=100
,random_seed = Random_State,)
Cbc_clf.fit(X_train, Y_train, verbose=False)
<catboost.core.CatBoostClassifier at 0x7fd598a5b040>
tmp_Cbc = pd.DataFrame({'Feature': X.columns, 'Feature importance': Cbc_clf.feature_importances_})
tmp_Cbc = tmp_Cbc.sort_values(by='Feature importance',ascending=False)
fig = px.bar(tmp_Cbc, x="Feature", y="Feature importance", color = "Feature",
title="Feature Importance", color_discrete_sequence=px.colors.sequential.Agsunset)
fig.show()
pred_Cbc = Cbc_clf.predict(X_valid)
roc_auc_score(Y_valid.values, pred_Cbc)
0.8970060623616312
dtrain = xgb.DMatrix(X_train, Y_train.values)
dvalid = xgb.DMatrix(X_valid, Y_valid.values)
dtest = xgb.DMatrix(X_test, Y_test.values)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.039
params['silent'] = True
params['max_depth'] = 2
params['subsample'] = 0.8
params['colsample_bytree'] = 0.9
params['eval_metric'] = 'auc'
params['random_state'] = Random_State
model = xgb.train(params,
dtrain,
1000,
watchlist,
early_stopping_rounds=50,
maximize=True,
verbose_eval=50)
[01:50:42] WARNING: /Users/runner/miniforge3/conda-bld/xgboost-split_1667849653518/work/src/learner.cc:767:
Parameters: { "silent" } are not used.
[0] train-auc:0.86326 valid-auc:0.86749
/Users/tyler/opt/anaconda3/lib/python3.9/site-packages/xgboost/core.py:617: FutureWarning: Pass `evals` as keyword args.
[50] train-auc:0.92639 valid-auc:0.93594 [100] train-auc:0.93248 valid-auc:0.95022 [150] train-auc:0.96907 valid-auc:0.98326 [200] train-auc:0.98785 valid-auc:0.98955 [250] train-auc:0.99164 valid-auc:0.99156 [300] train-auc:0.99387 valid-auc:0.99233 [350] train-auc:0.99590 valid-auc:0.99297 [400] train-auc:0.99723 valid-auc:0.99225 [403] train-auc:0.99729 valid-auc:0.99212
feature_important = model.get_score(importance_type='gain')
keys = list(feature_important.keys())
values = list(feature_important.values())
tmp_gbm = pd.DataFrame({'Feature': keys, 'Feature importance': values}).sort_values(by='Feature importance',ascending=False)
fig = px.bar(tmp_gbm, x="Feature", y="Feature importance", color = "Feature",
title="Feature Importance", color_discrete_sequence=px.colors.sequential.Agsunset)
fig.show()
pred_xgb = model.predict(dtest)
roc_auc_score(Y_test.values, pred_xgb)
0.980369660524218
print("This dataset consists of one target variable and thirty explanatory variables.\
\nThe target variable 'Class' has two indicators: 0 and 1, which denote 'Not Fraud transaction' and 'Fraud transaction', respectively.\
\nWe can observe that this dataset is highly imbalanced, with only 0.17% of rows belonging to fraudulent transactions.\
\nThis could be because credit card fraud is a rare occurrence, and cardholders may not always be aware of fraudulent charges on their bill.\
\n\nTo avoid overfitting, the data was split into three sets: train, validation, and test.\
\nFour ensemble methods were implemented: Random Forest classifier, AdaBoost classifier, CatBoost classifier, and XGBoost model.\
\nAdditionally, ROC_AUC scoring was conducted, as it is a suitable method for measuring accuracy in imbalanced data.\
\n\nAlthough the XGBoost model performed with a great score of 0.98, there is still room for improvement,\
\nsuch as over-sampling, under-sampling, applying other ensemble models, etc.")
This dataset consists of one target variable and thirty explanatory variables. The target variable 'Class' has two indicators: 0 and 1, which denote 'Not Fraud transaction' and 'Fraud transaction', respectively. We can observe that this dataset is highly imbalanced, with only 0.17% of rows belonging to fraudulent transactions. This could be because credit card fraud is a rare occurrence, and cardholders may not always be aware of fraudulent charges on their bill. To avoid overfitting, the data was split into three sets: train, validation, and test. Four ensemble methods were implemented: Random Forest classifier, AdaBoost classifier, CatBoost classifier, and XGBoost model. Additionally, ROC_AUC scoring was conducted, as it is a suitable method for measuring accuracy in imbalanced data. Although the XGBoost model performed with a great score of 0.98, there is still room for improvement, such as over-sampling, under-sampling, applying other ensemble models, etc.